package au.com.acpfg.misc.biojava;
import java.util.ArrayList;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.RowIterator;
import org.knime.core.data.collection.CollectionCellFactory;
import org.knime.core.data.collection.ListCell;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.JoinedRow;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
/**
 * Work-in-progress processor that looks for frameshifts using the codon phase of
 * SNP positions in a DNA sequence.
 *
 * <p>For every input row the sequence is scanned for letters other than
 * {@code A}, {@code C}, {@code G} and {@code T}; each such letter is treated as a
 * SNP marker and its codon phase, {@code (index + 1) % 3}, is recorded. A
 * sliding window of three phases is then summarised via {@code ModeSummary}.
 *
 * <p>NOTE: the detection itself is not finished. The output column currently
 * holds the constant {@code 0} for every non-empty sequence and a missing cell
 * for null/empty sequences; the windowed mode summary is computed but not yet
 * used to derive a count.
 */
public class SNPFrameshiftDetector implements BioJavaProcessorInterface {

    /**
     * Creates the processor. Neither argument is currently used.
     *
     * @param m    the owning node model
     * @param task the task name selected by the user
     */
    public SNPFrameshiftDetector(BioJavaProcessorNodeModel m, String task) {
    }

    /**
     * Appends one output cell to every row of the first input table and adds the
     * joined row to {@code cont}.
     *
     * @param m      node model used to check the sequence type and extract each row's sequence
     * @param exec   execution context used for cancellation checks and progress reporting
     * @param l      logger (currently unused)
     * @param inData input tables; only {@code inData[0]} is read
     * @param cont   container receiving the joined output rows
     * @throws InvalidSettingsException if the configured sequences are not DNA
     * @throws Exception                if a sequence contains a non-letter character other
     *                                  than A/C/G/T, or if execution is cancelled
     */
    @Override
    public void execute(BioJavaProcessorNodeModel m, ExecutionContext exec,
            NodeLogger l, BufferedDataTable[] inData, BufferedDataContainer cont)
            throws Exception {
        if (!m.areSequencesDNA()) {
            throw new InvalidSettingsException("Only DNA sequences are supported for now!");
        }

        RowIterator it = inData[0].iterator();
        int done = 0;
        int n_rows = inData[0].getRowCount();
        while (it.hasNext()) {
            DataRow r = it.next();
            String seq = m.getSequence(r);
            DataCell[] cells = new DataCell[1];

            // The null check must come before any use of seq: previously seq.length()
            // was evaluated first, so a null sequence raised a NullPointerException
            // instead of yielding a missing cell.
            if (seq != null && seq.length() > 0) {
                String codonPhases = collectSnpCodonPhases(seq, r);

                // according to Andreas, SNPs are mostly in the 3rd nucleotide per codon. If this
                // holds true then a windowed mode should yield the region of a frame shift.
                // Obviously, this doesn't help much when no SNPs are available or they are too
                // sparsely distributed along the sequence.
                cells[0] = new IntCell(0);

                // TODO: the windowed modes are not yet turned into a frameshift count; the
                // summary is only consumed by the (disabled) debug column.
                summariseWindowedModes(codonPhases);
                //cells[1] = new StringCell(modeSummary);
            } else {
                cells[0] = DataType.getMissingCell();
                //cells[1] = DataType.getMissingCell();
            }

            cont.addRowToTable(new JoinedRow(r, new DefaultRow(r.getKey(), cells)));

            done++;
            if (done % 100 == 0) {
                exec.checkCanceled();
                exec.setProgress(((double) done) / n_rows, "Completed row " + r.getKey());
            }
        }
    }

    /**
     * Returns the codon phases of all SNP markers in {@code seq}, one digit
     * ({@code 0}, {@code 1} or {@code 2}) per marker, in sequence order.
     *
     * <p>NOTE(review): the comparison is case-sensitive, so lower-case
     * {@code a/c/g/t} are also counted as SNP markers -- confirm this is intended.
     *
     * @param seq non-null sequence to scan
     * @param r   row the sequence came from; used only for the error message
     * @throws Exception if a character is neither A/C/G/T nor a letter
     */
    private static String collectSnpCodonPhases(String seq, DataRow r) throws Exception {
        StringBuilder phases = new StringBuilder();
        for (int i = 0; i < seq.length(); i++) {
            char c = seq.charAt(i);
            if (c != 'A' && c != 'T' && c != 'G' && c != 'C') {
                if (!Character.isLetter(c)) {
                    throw new Exception("Bad char: " + (int) c + " (encountered in row " + r.getKey() + ")");
                }
                // (i + 1) % 3 is always a single digit, so each marker adds exactly one char
                phases.append((i + 1) % 3);
            }
        }
        return phases.toString();
    }

    /**
     * Summarises every window of three consecutive codon phases with
     * {@code ModeSummary} and concatenates the summaries, each followed by
     * {@code ", "}. Yields an empty string when fewer than three phases exist.
     *
     * @param codonPhases phase digits as produced by {@link #collectSnpCodonPhases}
     */
    private static String summariseWindowedModes(String codonPhases) {
        StringBuilder modes = new StringBuilder();
        // window size of 3
        for (int i = 0; i < codonPhases.length() - 2; i++) {
            ModeSummary ms = new ModeSummary(codonPhases.charAt(i),
                    codonPhases.charAt(i + 1),
                    codonPhases.charAt(i + 2));
            modes.append(ms.toString());
            modes.append(", ");
        }
        return modes.toString();
    }

    /**
     * Describes the single integer column this processor appends to each row.
     *
     * @return a spec containing the "Number of detected frameshifts" column
     */
    @Override
    public DataTableSpec get_table_spec() {
        DataColumnSpec[] cols = new DataColumnSpec[1];
        cols[0] = new DataColumnSpecCreator("Number of detected frameshifts", IntCell.TYPE).createSpec();
        //cols[1] = new DataColumnSpecCreator("Debug", StringCell.TYPE).createSpec();
        return new DataTableSpec(cols);
    }

    /**
     * @return always {@code true}; {@link #execute} emits each input row joined
     *         with the new cell
     */
    @Override
    public boolean isMerged() {
        return true;
    }
}